# -*- coding: utf-8 -*-
import random
from scrapy import Request
from scrapy.utils.project import get_project_settings
from crecgec.rules import *
from zc_core.spiders.base import BaseSpider
from zc_core.dao.sku_pool_dao import SkuPoolDao
from zc_core.dao.batch_dao import BatchDao
from zc_core.util.done_filter import DoneFilter


class FullSpider(BaseSpider):
    """Spider that requests the goods-detail page of every SKU in the pool.

    Pool records are read through ``SkuPoolDao``. Each record that passes the
    offline-time and already-collected checks is turned into one detail-page
    request, whose response is handed to ``parse_item_data``.
    """

    name = 'full'

    # Detail-page URL template. It is filled with the first two '_'-separated
    # parts of the pool record's ``_id`` followed by its ``catalog3Id``.
    item_url = 'https://mall.crecgec.com/search/goodsDetail/{}/{}/{}.html'

    def __init__(self, batchNo=None, *args, **kwargs):
        """Initialise the spider and register the batch.

        :param batchNo: batch number forwarded to ``BaseSpider``; presumably
            the base class derives ``self.batch_no`` from it (not visible
            here -- confirm in ``BaseSpider``).
        """
        super(FullSpider, self).__init__(batchNo=batchNo, *args, **kwargs)
        # Create the batch record.
        BatchDao().create_batch(self.batch_no)
        # Used to avoid collecting the same SKU twice within a batch.
        self.done_filter = DoneFilter(self.batch_no)

    def _load_sku_pool(self, settings):
        """Return the SKU pool records to crawl.

        When the ``CATALOG_WHITE_LIST`` setting is non-empty it is passed to
        the DAO as an ``$or`` query; otherwise no query is passed at all.
        """
        # Built per call so the DAO never receives a shared mutable object.
        fields = {'_id': 1, 'batchNo': 1, 'offlineTime': 1,
                  'catalog1Id': 1, 'skuId': 1, 'catalog1Name': 1,
                  'catalog3Id': 1}
        white_list = settings.get("CATALOG_WHITE_LIST")
        if white_list:
            return SkuPoolDao().get_sku_pool_list(fields=fields,
                                                  query={"$or": white_list})
        return SkuPoolDao().get_sku_pool_list(fields=fields)

    def start_requests(self):
        """Yield one detail-page ``Request`` per eligible SKU in the pool.

        A SKU is skipped when its ``offlineTime`` exceeds ``MAX_OFFLINE_TIME``
        (default 2), when it is already recorded in the done filter and
        ``FORCE_RECOVER`` is not set, or when its ``_id`` cannot be split into
        the two parts the URL needs.
        """
        # Settings do not change while iterating, so read them once instead
        # of rebuilding the settings object for every SKU.
        settings = get_project_settings()
        max_offline_time = settings.get('MAX_OFFLINE_TIME', 2)
        force_recover = settings.get('FORCE_RECOVER', False)

        pool_list = self._load_sku_pool(settings)
        self.logger.info('全量：%s', len(pool_list))
        # Randomise the crawl order.
        random.shuffle(pool_list)
        for sku in pool_list:
            _id = sku.get('_id')
            catalog3Id = sku.get('catalog3Id')
            # A stored null would make the comparison below raise TypeError,
            # so treat a missing or null value as 0.
            offline_time = sku.get('offlineTime') or 0
            if offline_time > max_offline_time:
                self.logger.info('忽略: [%s][%s]', sku, offline_time)
                continue
            # Avoid collecting the same SKU twice.
            if self.done_filter.contains(_id) and not force_recover:
                self.logger.info('已采：[%s]', _id)
                continue
            # Split once; a malformed id must not abort the whole generator.
            id_parts = _id.split('_') if isinstance(_id, str) else []
            if len(id_parts) < 2:
                self.logger.warning('忽略: [%s] 商品编号格式异常', _id)
                continue
            # Request the goods-detail page.
            yield Request(
                url=self.item_url.format(id_parts[0], id_parts[1], catalog3Id),
                callback=self.parse_content_data,
                errback=self.error_back,
                priority=260,
                meta={
                    'reqType': 'item',
                    'batchNo': self.batch_no,
                    'skuId': _id,
                    'catalog3Id': catalog3Id,
                },
            )

    # Handle ItemData
    def parse_content_data(self, response):
        """Parse a goods-detail response and yield the item if one was built.

        Nothing is yielded when ``parse_item_data`` returns a falsy value.
        """
        data = parse_item_data(response)
        if data:
            self.logger.info('商品: [%s]', data.get('skuId'))
            yield data
